Precision Analytics Intermediate Code Challenge

Descriptive Statistics of Bechdel dataset

#Do we have missing data?
bech %>% 
  mutate(across(.cols = -c(imdb,code,binary), ~as.numeric(.x))) %>% 
  na.omit(.)
## # A tibble: 1,484 x 10
##     year imdb         budget  domgross  intgross code     budget_2013 domgross_2013
##    <dbl> <chr>         <dbl>     <dbl>     <dbl> <chr>          <dbl>         <dbl>
##  1  1974 tt0071562  13000000  57300000  57300000 1974PASS    61408439     270669505
##  2  1982 tt0084516  10700000  74706019 121706019 1982PASS    25821968     180285645
##  3  2008 tt0800241  15000000   2203641   6379575 2008PASS    16233845       2384904
##  4  2011 tt1625346  12000000  16311571  22750356 2011PASS    12428289      16893744
##  5  2000 tt0190590  26000000  45506619  75763814 2000FAIL    35175618      61566286
##  6  2009 tt1232207  20000000  14363397  19121531 2009FAIL    21714632      15594794
##  7  2007 tt0449088 300000000 309420425 960996492 2007PASS   337063045     347647302
##  8  2009 tt0875034  80000000  19676965  53508858 2009FAIL    86858528      21363903
##  9  2005 tt0369994   2000000   2072645   2077844 2005PASS     2386066       2472734
## 10  2002 tt0283139  16000000  16357770  21657770 2002PASS    20722867      21186244
## # ... with 1,474 more rows, and 2 more variables: intgross_2013 <dbl>,
## #   binary <chr>
#1,484 rows have no NA or missing values. Drop the others.

bech = bech %>% 
  mutate(across(.cols = -c(imdb,code,binary), ~as.numeric(.x))) %>% 
  na.omit(.)

bech %>% 
  count(binary)
## # A tibble: 2 x 2
##   binary     n
##   <chr>  <int>
## 1 FAIL     820
## 2 PASS     664
#820 FAILS, 664 PASSES.

#Summary.
summary(bech)
##       year          imdb               budget             domgross        
##  Min.   :1970   Length:1484        Min.   :     7000   Min.   :      828  
##  1st Qu.:1998   Class :character   1st Qu.: 12000000   1st Qu.: 16244304  
##  Median :2005   Mode  :character   Median : 29000000   Median : 42704878  
##  Mean   :2003                      Mean   : 45555379   Mean   : 70440928  
##  3rd Qu.:2009                      3rd Qu.: 63000000   3rd Qu.: 95052518  
##  Max.   :2013                      Max.   :425000000   Max.   :760507625  
##     intgross             code            budget_2013        domgross_2013      
##  Min.   :8.280e+02   Length:1484        Min.   :     8632   Min.   :8.990e+02  
##  1st Qu.:2.583e+07   Class :character   1st Qu.: 16143738   1st Qu.:2.022e+07  
##  Median :7.875e+07   Mode  :character   Median : 36995786   Median :5.608e+07  
##  Mean   :1.545e+08                      Mean   : 56311299   Mean   :9.719e+07  
##  3rd Qu.:1.986e+08                      3rd Qu.: 81684071   3rd Qu.:1.255e+08  
##  Max.   :2.784e+09                      Max.   :461435929   Max.   :1.772e+09  
##  intgross_2013          binary         
##  Min.   :8.990e+02   Length:1484       
##  1st Qu.:3.278e+07   Class :character  
##  Median :9.933e+07   Mode  :character  
##  Mean   :2.037e+08                     
##  3rd Qu.:2.467e+08                     
##  Max.   :3.172e+09
#Simple bar plot for pass/fail.
ggplotly(
  bech %>% 
  count(binary) %>%  
  mutate(bech_percent = n / sum(n)) %>% 
  ggplot() + 
  geom_col(aes(x = binary, y = bech_percent, fill = binary)) + 
  scale_y_continuous(limits = c(0, 1), 
                     labels = label_percent()) + 
  labs(y = "Bechdel Test Proportion (%)",
       x = "Bechdel Test Result",
       subtitle = "Bechdel Test Results for 1500 IMDB records (1970 - 2013)",
       title = "Lack of Real Portrayal of Women in Fiction") + 
  scale_fill_brewer(palette = "Set1") + 
  theme(legend.position = "none")
)
#Does Bechdel Test result change over time?
bech %>% 
  ggplot(aes(year, fill = binary, group = binary)) +
  geom_density(position="fill") + 
  labs(title = "Portayal of Women in Fiction",
       subtitle = "Some Improvement in 40 Years",
       x = "Year", 
       y = 'Proportion Pass/Fail',
       fill = "Bechdel Test") + 
  scale_y_continuous(labels = percent_format())

Variable Distribution Exploration

#Distribution of predictor variables.
bech %>% 
  select(-code) %>% 
  pivot_longer(cols = -c(year,imdb,binary), names_to = "Variables") %>% 
  ggplot() + 
  geom_histogram(aes(x = year, group = Variables, fill = Variables), bins = 43) + 
  facet_wrap(~ Variables) + 
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
  labs(title = "Skewed Distributions")

#Predictors 
bech %>% 
  select(-code) %>% 
  pivot_longer(cols = -c(year,imdb,binary), names_to = "Variables") %>% 
  mutate(value = log(value)) %>% 
  ggplot() + 
  geom_boxplot(aes(y = value, x = binary, group = binary, fill = binary)) + 
  facet_wrap( ~ Variables, scales = "free") + 
  labs(y = "US Dollars (logged)",
       x = "Bechdel Test Result",
       title = "More Money, Less Representation?",
       subtitle = "Possible Correlation with Bechdel Test Failure and Money") + 
  theme(legend.position = "none") + 
  scale_y_continuous(labels = dollar_format())

Model Variable Selection and Model Testing

GLM

#require(caret)
# Splitting the data into train and test
moddat = bech %>% 
  select(-imdb, -code) %>% 
  mutate(binary = ifelse(binary == "PASS", 1, 0))

# Training the model
logmodel <- glm(binary ~ ., family = binomial, moddat)

# Check out the model summary
summary(logmodel)
## 
## Call:
## glm(formula = binary ~ ., family = binomial, data = moddat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4367  -1.1293  -0.8252   1.1676   2.3270  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.002e+01  2.072e+01   0.484 0.628701    
## year          -4.920e-03  1.034e-02  -0.476 0.634124    
## budget         1.048e-08  8.600e-09   1.219 0.222930    
## domgross       2.368e-08  7.173e-09   3.301 0.000964 ***
## intgross      -9.630e-09  3.515e-09  -2.740 0.006145 ** 
## budget_2013   -1.594e-08  7.183e-09  -2.219 0.026468 *  
## domgross_2013 -1.895e-08  5.531e-09  -3.427 0.000611 ***
## intgross_2013  8.216e-09  2.774e-09   2.961 0.003063 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.8  on 1483  degrees of freedom
## Residual deviance: 1977.3  on 1476  degrees of freedom
## AIC: 1993.3
## 
## Number of Fisher Scoring iterations: 5
broom::tidy(logmodel)
## # A tibble: 8 x 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    1.00e+1   2.07e+1     0.484 0.629   
## 2 year          -4.92e-3   1.03e-2    -0.476 0.634   
## 3 budget         1.05e-8   8.60e-9     1.22  0.223   
## 4 domgross       2.37e-8   7.17e-9     3.30  0.000964
## 5 intgross      -9.63e-9   3.51e-9    -2.74  0.00615 
## 6 budget_2013   -1.59e-8   7.18e-9    -2.22  0.0265  
## 7 domgross_2013 -1.90e-8   5.53e-9    -3.43  0.000611
## 8 intgross_2013  8.22e-9   2.77e-9     2.96  0.00306
moddat$predicted_glm <- ifelse(logmodel$fitted.values >= 0.5, "PASS", "FAIL")

#How accurate is our model? 
results_tab = table(moddat$binary, moddat$predicted_glm)
acc = sum(diag(results_tab))/sum(results_tab)*100
print(paste0("Accuracy: ",round(acc,2),"%"))
## [1] "Accuracy: 58.36%"
roc <- roc(moddat$binary, logmodel$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ",round(auc(roc), 4)))
## [1] "AUC: 0.6162"

GLM just with 2013 variables

# Training the model
logmodel_2013 <- glm(binary ~ budget_2013 + domgross_2013 + intgross_2013, family = binomial, moddat)

# Check out the model summary
summary(logmodel_2013)
## 
## Call:
## glm(formula = binary ~ budget_2013 + domgross_2013 + intgross_2013, 
##     family = binomial, data = moddat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3346  -1.1135  -0.8468   1.1904   1.8752  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.899e-01  8.118e-02   2.340 0.019296 *  
## budget_2013   -6.356e-09  1.324e-09  -4.799 1.59e-06 ***
## domgross_2013 -4.267e-09  1.248e-09  -3.419 0.000628 ***
## intgross_2013  1.769e-09  5.870e-10   3.014 0.002576 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.8  on 1483  degrees of freedom
## Residual deviance: 1997.5  on 1480  degrees of freedom
## AIC: 2005.5
## 
## Number of Fisher Scoring iterations: 4
broom::tidy(logmodel_2013)
## # A tibble: 4 x 5
##   term          estimate std.error statistic    p.value
##   <chr>            <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)    1.90e-1  8.12e- 2      2.34 0.0193    
## 2 budget_2013   -6.36e-9  1.32e- 9     -4.80 0.00000159
## 3 domgross_2013 -4.27e-9  1.25e- 9     -3.42 0.000628  
## 4 intgross_2013  1.77e-9  5.87e-10      3.01 0.00258
moddat$predicted_glm_2013 <- ifelse(logmodel_2013$fitted.values >= 0.5, "PASS", "FAIL")

#How accurate is our model? 
results_tab = table(moddat$binary, moddat$predicted_glm_2013)
acc = sum(diag(results_tab))/sum(results_tab)*100
print(paste0("Accuracy: ",round(acc,2),"%"))
## [1] "Accuracy: 56.27%"
roc <- roc(moddat$binary, logmodel_2013$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ",round(auc(roc), 4)))
## [1] "AUC: 0.5955"

GLM with stepAIC

logmodel_aic = MASS::stepAIC(logmodel)
## Start:  AIC=1993.29
## binary ~ year + budget + domgross + intgross + budget_2013 + 
##     domgross_2013 + intgross_2013
## 
##                 Df Deviance    AIC
## - year           1   1977.5 1991.5
## - budget         1   1978.8 1992.8
## <none>               1977.3 1993.3
## - budget_2013    1   1982.6 1996.6
## - intgross       1   1985.8 1999.8
## - intgross_2013  1   1987.4 2001.4
## - domgross       1   1991.0 2005.0
## - domgross_2013  1   1993.4 2007.4
## 
## Step:  AIC=1991.52
## binary ~ budget + domgross + intgross + budget_2013 + domgross_2013 + 
##     intgross_2013
## 
##                 Df Deviance    AIC
## - budget         1   1978.9 1990.9
## <none>               1977.5 1991.5
## - budget_2013    1   1983.6 1995.6
## - intgross       1   1986.1 1998.1
## - intgross_2013  1   1987.8 1999.8
## - domgross       1   1992.5 2004.5
## - domgross_2013  1   1995.7 2007.7
## 
## Step:  AIC=1990.9
## binary ~ domgross + intgross + budget_2013 + domgross_2013 + 
##     intgross_2013
## 
##                 Df Deviance    AIC
## <none>               1978.9 1990.9
## - intgross       1   1986.1 1996.1
## - intgross_2013  1   1987.8 1997.8
## - domgross       1   1992.9 2002.9
## - domgross_2013  1   1996.0 2006.0
## - budget_2013    1   2007.1 2017.1
#What if we used a stepAIC'd model?
moddat$predicted_glm_aic <- ifelse(logmodel_aic$fitted.values >= 0.5, "PASS", "FAIL")

#How accurate is our model? 
results_tab = table(moddat$binary, moddat$predicted_glm_aic)
acc = sum(diag(results_tab))/sum(results_tab)*100
print(paste0("Accuracy: ",round(acc,2),"%"))
## [1] "Accuracy: 58.89%"
roc <- roc(moddat$binary, logmodel_aic$fitted.values)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste0("AUC: ",round(auc(roc), 4)))
## [1] "AUC: 0.6173"

Mixed Model

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
mmodel = lmer(binary ~ budget_2013 + domgross_2013 + intgross_2013 + (1 | year), data = moddat)

summary(mmodel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: binary ~ budget_2013 + domgross_2013 + intgross_2013 + (1 | year)
##    Data: moddat
## 
## REML criterion at convergence: 2230
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1841 -0.9370 -0.6088  1.0360  1.7887 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  year     (Intercept) 0.00202  0.04494 
##  Residual             0.23897  0.48885 
## Number of obs: 1484, groups:  year, 44
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)    5.396e-01  2.154e-02  25.056
## budget_2013   -1.504e-09  2.968e-10  -5.069
## domgross_2013 -9.541e-10  2.771e-10  -3.443
## intgross_2013  4.015e-10  1.345e-10   2.985
## 
## Correlation of Fixed Effects:
##             (Intr) b_2013 d_2013
## budget_2013 -0.499              
## dmgrss_2013 -0.348  0.288       
## intgrs_2013  0.280 -0.495 -0.915
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
moddat$predicted_mmodel =  predict(mmodel, re.form = NA)
moddat$predicted_mmodel <- ifelse(moddat$predicted_mmodel >= 0.5, "PASS", "FAIL")

#How accurate is our model? 
results_tab = table(moddat$binary, moddat$predicted_mmodel)
acc = sum(diag(results_tab))/sum(results_tab)*100
print(paste0("Accuracy: ",round(acc,2),"%"))
## [1] "Accuracy: 56%"

Looks like the step AIC model is the best… Let’s use it.

Predict for Test Dataset!

test = read.csv('Test.csv') %>% 
  as_tibble(.)

test = test %>% 
  mutate(across(-c(imdb,code), ~as.numeric(.x))) %>% 
  filter(!is.na(domgross))

test$binary_predicted = predict(logmodel_aic, test, type = "response")

test = test %>% 
  mutate(binary_predicted = ifelse(binary_predicted > 0.5, "PASS", "FAIL"))

test %>% 
  count(binary_predicted)
## # A tibble: 2 x 2
##   binary_predicted     n
##   <chr>            <int>
## 1 FAIL               189
## 2 PASS               103
test %>% 
  count(binary_predicted) %>%  
  mutate(bech_percent = n / sum(n)) %>% 
  ggplot() + 
  geom_col(aes(x = binary_predicted, y = bech_percent, fill = binary_predicted)) + 
  scale_y_continuous(limits = c(0, 1), 
                     labels = label_percent()) + 
  labs(y = "Predicted Bechdel Test Proportion (%)",
       x = "Bechdel Test Result",
       subtitle = "Predicted Bechdel Test Results for 1500 IMDB records (1970 - 2013)",
       title = "Lack of Real Portrayal of Women in Fiction") + 
  scale_fill_brewer(palette = "Set1") + 
  theme(legend.position = "none")

#Does Bechdel Test result change over time?
test %>% 
  ggplot(aes(year, fill = binary_predicted, group = binary_predicted)) +
  geom_density(position="fill") + 
  labs(title = "Predicted Portayal of Women in Fiction",
       subtitle = "Some Improvement in 40 Years?",
       x = "Year", 
       y = 'Predicted Proportion Pass/Fail',
       fill = "Bechdel Test") + 
  scale_y_continuous(labels = percent_format())

write.csv(test, 'test_dataset_with_predictions.csv', row.names = F)